# load libraries
library(quanteda)
library(readtext)
Registered S3 method overwritten by 'data.table':
method from
print.data.table
library(wordcloud)
Lade nötiges Paket: RColorBrewer
library(RColorBrewer)
library(wordcloud2)
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5 ✓ purrr 0.3.4
✓ tibble 3.1.2 ✓ dplyr 1.0.7
✓ tidyr 1.1.3 ✓ stringr 1.4.0
✓ readr 2.0.0 ✓ forcats 0.5.1
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(tm)
Lade nötiges Paket: NLP
Attache Paket: ‘NLP’
Das folgende Objekt ist maskiert ‘package:ggplot2’:
annotate
Die folgenden Objekte sind maskiert von ‘package:quanteda’:
meta, meta<-
Attache Paket: ‘tm’
Das folgende Objekt ist maskiert ‘package:quanteda’:
stopwords
library("textcat")
library("quanteda.textplots")
library("quanteda.textstats")
library("gsubfn")
Lade nötiges Paket: proto
Warnung in doTryCatch(return(expr), name, parentenv, handler)
kann shared object '/Library/Frameworks/R.framework/Resources/modules//R_X11.so' nicht laden:
dlopen(/Library/Frameworks/R.framework/Resources/modules//R_X11.so, 6): Library not loaded: /opt/X11/lib/libSM.6.dylib
Referenced from: /Library/Frameworks/R.framework/Versions/4.1/Resources/modules/R_X11.so
Reason: image not found
Could not load tcltk. Will use slower R code instead.
library("spacyr")
spacy_initialize(model = "de_core_news_sm")
Found 'spacy_condaenv'. spacyr will use this environment
successfully initialized (spaCy Version: 3.1.1, language model: de_core_news_sm)
(python options: type = "condaenv", value = "spacy_condaenv")
# load corpus files -----------------------------------------------------
# Each .rds file holds a pre-built corpus object; read every file listed
# below into the global environment under the name given on the left.
corpus_paths <- c(
  full_corpus        = "corpora/full_corpus.rds",
  full_corpus_sents  = "corpora/full_corpus_sents.rds",
  pro_corpus         = "corpora/pro_corpus.rds",
  contra_corpus      = "corpora/contra_corpus.rds",
  pro2000            = "corpora/pro2000.rds",
  pro900             = "corpora/pro900.rds",
  contra2000         = "corpora/contra2000.rds",
  contra900          = "corpora/contra900.rds",
  fff_de_corpus      = "corpora/fff_de_corpus.rds",
  ikem_corpus        = "corpora/ikem_corpus.rds",
  klimarep_corpus    = "corpora/klimarep_corpus.rds",
  klimafakten_corpus = "corpora/klimafakten_corpus.rds",
  zero_corpus        = "corpora/zero_corpus.rds",
  komma_corpus       = "corpora/komma_corpus.rds",
  eike_corpus        = "corpora/eike_corpus.rds",
  ffh_corpus         = "corpora/ffh_corpus.rds"
)
for (obj_name in names(corpus_paths)) {
  assign(obj_name, readRDS(corpus_paths[[obj_name]]))
}
summary(pro2000, n = 10)
Corpus consisting of 2000 documents, showing 10 documents:
Text Types Tokens Sentences origin language group id
kr_00007.txt 196 284 19 kr german activists 1
ikem_01141.txt 27 30 2 ikem german activists 2
fff_de_00121.txt 105 178 11 fff_de german activists 3
ikem_00898.txt 210 869 32 ikem german activists 4
ikem_00709.txt 20 43 3 ikem german activists 5
ikem_00588.txt 209 573 11 ikem german activists 6
kr_00036.txt 415 695 30 kr german activists 7
ikem_00627.txt 353 1003 23 ikem german activists 8
ikem_00532.txt 277 530 29 ikem german activists 9
ikem_00518.txt 140 233 6 ikem german activists 10
summary(contra2000, n=10)
Corpus consisting of 2000 documents, showing 10 documents:
Text Types Tokens Sentences origin language group id
eike_05241.txt 484 943 36 eike german sceptics 1
eike_07670.txt 67 79 8 eike german sceptics 2
eike_12660.txt 24 30 2 eike german sceptics 3
eike_13073.txt 96 135 4 eike german sceptics 4
eike_09381.txt 22 27 2 eike german sceptics 5
eike_05930.txt 101 152 6 eike german sceptics 6
eike_02092.txt 1412 3659 173 eike german sceptics 7
eike_03703.txt 1645 4420 187 eike german sceptics 8
eike_12814.txt 21 26 2 eike german sceptics 9
eike_04522.txt 1050 2511 128 eike german sceptics 10
# Document-length overviews ---------------------------------------------
# summary() on a quanteda corpus returns a data frame with Types/Tokens/
# Sentences per document plus the docvars (incl. `id`, used as x axis).
#id_pro = 1:ndoc(pro900)
contra2000_sum <- summary(contra2000)
pro2000_sum <- summary(pro2000)
pro900_sum <- summary(pro900, ndoc(pro900))
contra900_sum <- summary(contra900, ndoc(contra900))
# to get id as x axis
#id_pro[1:100]
#contra900_sum$id
# Sentence counts per document
ggplot(pro2000_sum, aes(id, Sentences, group = 1)) +
  geom_line() +
  geom_point() +
  theme(axis.text.x = element_text(angle = 0, vjust = 1, hjust = 1)) +
  ggtitle("Sentences Pro2000")
ggplot(contra2000_sum, aes(id, Sentences, group = 1)) +
  geom_line() +
  geom_point() +
  theme(axis.text.x = element_text(angle = 0, vjust = 1, hjust = 1)) +
  ggtitle("Sentences Contra2000")
# restrict the 900-document samples to their first 100 documents
pro900_sum <- summary(pro900, n = 100)
contra900_sum <- summary(contra900, n = 100)
# Token counts per document. Fix: reference the column `id` directly
# inside aes() -- `pro900_sum$id` bypasses ggplot's data masking and
# breaks under faceting/grouping.
ggplot(pro900_sum, aes(id, Tokens, group = 1)) +
  geom_line() +
  geom_point() +
  theme(axis.text.x = element_text(angle = 0, vjust = 1, hjust = 1)) +
  ggtitle("Tokens Pro900")
ggplot(contra900_sum, aes(id, Tokens, group = 1)) +
  geom_line() +
  geom_point() +
  theme(axis.text.x = element_text(angle = 0, vjust = 1, hjust = 1)) +
  ggtitle("Tokens Contra900")
# Type counts per document
ggplot(pro900_sum, aes(id, Types, group = 1)) +
  geom_line() +
  geom_point() +
  theme(axis.text.x = element_text(angle = 0, vjust = 1, hjust = 1)) +
  ggtitle("Types Pro900")
ggplot(contra900_sum, aes(id, Types, group = 1)) +
  geom_line() +
  geom_point() +
  theme(axis.text.x = element_text(angle = 0, vjust = 1, hjust = 1)) +
  ggtitle("Types Contra900")
# (two stray "NA" values echoed by the console were removed here)
# Type-token relation: lexical-diversity scatter with a linear trend.
# Fixes: pass a real formula (y ~ x) to geom_smooth() instead of a string,
# and spell out TRUE rather than the reassignable shorthand T.
ggplot(pro900_sum, aes(Tokens, Types, group = 1, label = id)) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  geom_text(check_overlap = TRUE) +
  ggtitle("Type-Token-Relation Pro900")
ggplot(contra900_sum, aes(Tokens, Types, group = 1, label = id)) +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE) +
  geom_text(check_overlap = TRUE) +
  ggtitle("Type-Token-Relation Contra900")
# Word Frequencies
# stoplists --------------------------------------------------------------
de_stopwords <- stopwords::stopwords("de", source = "snowball")
en_stopwords <- stopwords::stopwords("en", source = "snowball")
# One stopword per line; readLines() yields a character vector directly.
# (The previous read.table(sep = "\n") returned a one-column data.frame,
# and splicing that into c() silently produced a LIST, not a character
# vector, so full_stopwords was not a valid pattern vector.)
custom_stopwords <- readLines("de_complete.txt")
# add own stopwords
full_stopwords <- c(de_stopwords, "dass", "=", "the", "seit", "ab", "beim",
                    "\n", "mal", "c", "|", "m", "kommentare", "neueste",
                    "gepostet", custom_stopwords, en_stopwords)
de_stopwords1 <- c(de_stopwords, "dass", "=", "the", "seit", "ab", "beim",
                   "\n", "mal", "c", "\\|", "|", "m", "kommentare",
                   "neueste", "gepostet", "admin", "cookies", "inhalte",
                   "inhalt", "newsletter", "posten", "zugriff", "passwort",
                   "geschützt", "seite", "website", "webseite", "and",
                   "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
                   "mfg", "w", "t", "wer")
# create dfm -------------------------------------------------------------
# quanteda >= 3 deprecates dfm(corpus, remove = ...) (see the warnings in
# the transcript): tokenize first, drop stopwords with tokens_remove().
dfm_p2000 <- pro2000 %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_remove(full_stopwords) %>%
  dfm()
Warnung: 'dfm.corpus()' is deprecated. Use 'tokens()' first.
Warnung: '...' should not be used for tokens() arguments; use 'tokens()' first.
Warnung: 'remove' is deprecated; use dfm_remove() instead
dfm_c2000 <- dfm(contra2000, remove=full_stopwords, remove_punct=TRUE, remove_numbers=TRUE)
Warnung: 'dfm.corpus()' is deprecated. Use 'tokens()' first.
Warnung: '...' should not be used for tokens() arguments; use 'tokens()' first.
Warnung: 'remove' is deprecated; use dfm_remove() instead
# pro
# Lemmatize the pro corpus with spaCy (model de_core_news_sm, initialized
# above); POS tagging, entity recognition and dependency parsing are
# switched off since only the lemmata are used downstream.
sp_pro2000 <- spacy_parse(pro2000, pos=FALSE, entity=FALSE, dependency=FALSE)
Warnung in spacy_parse.character(pro2000, pos = FALSE, entity = FALSE, dependency = FALSE)
lemmatization may not work properly in model 'de_core_news_sm'
# Replace surface tokens with their lemmas, then build a lemmatized dfm.
# Uses the non-deprecated tokens_* pipeline (dfm(x, remove = ...) triggers
# deprecation warnings in quanteda >= 3). Lowercasing happens before
# stopword removal, matching the old dfm() behavior.
sp_pro2000$token <- sp_pro2000$lemma
sp_dfm_p2000 <- as.tokens(sp_pro2000) %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(full_stopwords) %>%
  dfm()
Warnung: '...' should not be used for tokens() arguments; use 'tokens()' first.
Warnung: 'remove' is deprecated; use dfm_remove() instead
# contra
# Lemmatize the contra corpus with the same spaCy settings as the pro
# corpus above (only lemmata needed; POS/entities/dependencies disabled).
sp_contra2000 <- spacy_parse(contra2000, pos=FALSE, entity=FALSE, dependency=FALSE)
Warnung in spacy_parse.character(contra2000, pos = FALSE, entity = FALSE,
lemmatization may not work properly in model 'de_core_news_sm'
# Lemmatized dfm for the contra corpus via the non-deprecated tokens_*
# pipeline (mirrors the pro-corpus construction).
sp_contra2000$token <- sp_contra2000$lemma
sp_dfm_c2000 <- as.tokens(sp_contra2000) %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(full_stopwords) %>%
  dfm()
Warnung: '...' should not be used for tokens() arguments; use 'tokens()' first.
Warnung: 'remove' is deprecated; use dfm_remove() instead
sp_full <- spacy_parse(full_corpus, pos=FALSE, entity=FALSE, dependency=FALSE)
Warnung in spacy_parse.character(full_corpus, pos = FALSE, entity = FALSE,
lemmatization may not work properly in model 'de_core_news_sm'
# Lemmatized dfm of the full corpus via the non-deprecated tokens_*
# pipeline (mirrors the per-group constructions above).
sp_full$token <- sp_full$lemma
sp_dfm_full <- as.tokens(sp_full) %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_tolower() %>%
  tokens_remove(full_stopwords) %>%
  dfm()
Warnung: '...' should not be used for tokens() arguments; use 'tokens()' first.
Warnung: 'remove' is deprecated; use dfm_remove() instead
dfm_p2000
Document-feature matrix of: 2,000 documents, 60,243 features (99.72% sparse) and 4 docvars.
features
docs klima update ° folge extremwetter eu-klimaplan versicherungswende wütet weltweit eu-kommission
kr_00007.txt 3 3 3 1 3 1 1 2 2 1
ikem_01141.txt 0 0 0 0 0 0 0 0 0 0
fff_de_00121.txt 0 0 0 0 0 0 0 0 0 0
ikem_00898.txt 0 0 0 0 0 0 0 0 0 0
ikem_00709.txt 0 0 0 0 0 0 0 0 0 0
ikem_00588.txt 0 0 0 0 0 0 0 0 0 0
[ reached max_ndoc ... 1,994 more documents, reached max_nfeat ... 60,233 more features ]
dfm_c2000
Document-feature matrix of: 2,000 documents, 150,974 features (99.71% sparse) and 4 docvars.
features
docs interne ermittler schule glauben ehemalige labortechnikerin erin potts-kant daten medizinische
eike_05241.txt 1 1 2 1 2 1 1 6 7 1
eike_07670.txt 0 0 0 0 0 0 0 0 0 0
eike_12660.txt 0 0 0 0 0 0 0 0 0 0
eike_13073.txt 0 0 0 0 0 0 0 0 0 0
eike_09381.txt 0 0 0 0 0 0 0 0 0 0
eike_05930.txt 0 0 0 0 0 0 0 0 0 0
[ reached max_ndoc ... 1,994 more documents, reached max_nfeat ... 150,964 more features ]
topfeatures(sp_dfm_p2000, n=50)
mehr uhr ikem mensch jahr weit deutschland geben energie
4283 3497 2657 2464 2062 1698 1645 1602 1592
thema aktuell sollen groß gehen klimaschutz gut sowie immer
1402 1362 1323 1319 1301 1289 1276 1207 1156
bleiben information müssen schon dabei future arbeit stehen energiewende
1152 1151 1144 1084 1020 992 977 926 901
politik finden welch land projekt ziel wichtig berlin kommen
895 891 886 876 869 863 851 846 838
fridays erfahren jed zukunft klimakrise newsletter ganz neu rahmen
831 825 816 813 796 780 769 750 748
laufende möglich erst anmelden frage
741 735 721 720 717
topfeatures(sp_dfm_c2000, n=50)
jahr geben mehr schon immer co2 gut kommen ja gehen
9197 6646 6328 4722 4289 4114 4067 4056 3951 3821
weit deutschland sollen mensch sagen groß welch jed zeigen energie
3786 3643 3640 3559 3448 3378 3332 3286 3102 2921
hoch wenig ganz global sehen müssen temperatur strom herr natürlich
2883 2869 2834 2523 2503 2480 2423 2399 2381 2236
stehen einfach ° zeit klima erst finden heute frage land
2234 2226 2221 2215 2196 2154 2121 2064 2048 2001
klimawandel liegen etwa erwärmung genau wissen erde welt atmosphäre tun
1944 1912 1901 1856 1852 1852 1824 1822 1808 1804
# Top features and frequency tables --------------------------------------
tf_p2000 <- topfeatures(sp_dfm_p2000, n = 50)
tf_c2000 <- topfeatures(sp_dfm_c2000, n = 50)
textstat_frequency(sp_dfm_p2000, n = 50)
# all features starting with "klima"
klima_p2000 <- dfm_select(sp_dfm_p2000, pattern = "klima*")
klima_c2000 <- dfm_select(sp_dfm_c2000, pattern = "klima*")
klima_terms_p2000 <- topfeatures(klima_p2000, n = 100)
klima_terms_c2000 <- topfeatures(klima_c2000, n = 100)
freq_p2000 <- textstat_frequency(sp_dfm_p2000, n = 50)
freq_c2000 <- textstat_frequency(sp_dfm_c2000, n = 50)
# Fix: write the frequency-ordered factor back into the table's own
# `feature` column. The original stored it in unused variables
# (plot_p2000/plot_c2000), so the subsequent plots came out in
# alphabetical order instead of by descending frequency.
freq_p2000$feature <- with(freq_p2000, reorder(feature, -frequency))
freq_c2000$feature <- with(freq_c2000, reorder(feature, -frequency))
# Dot plots of the 50 most frequent features per sub-corpus; x-axis
# labels are rotated vertically for readability.
plot1 <- ggplot(freq_p2000, aes(x = feature, y = frequency)) +
  geom_point() +
  labs(title = "P2000 Frequencies") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# ggsave(plot = plot1, width = 10, height = 5, dpi = 300, filename = "klima_eike_plot.jpeg")
plot1
plot2 <- ggplot(freq_c2000, aes(x = feature, y = frequency)) +
  geom_point() +
  labs(title = "C2000 Frequencies") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# ggsave(plot = plot2, width = 10, height = 5, dpi = 300, filename = "klima_klimarep_plot.jpeg")
plot2
# Frequencies restricted to "klima*" features ----------------------------
freq_klima_p2000 <- textstat_frequency(klima_p2000, n = 50)
freq_klima_c2000 <- textstat_frequency(klima_c2000, n = 50)
# order the feature factor by descending frequency before plotting
freq_klima_p2000$feature <- with(freq_klima_p2000, reorder(feature, -frequency))
freq_klima_c2000$feature <- with(freq_klima_c2000, reorder(feature, -frequency))
plot1 <- ggplot(freq_klima_p2000, aes(x = feature, y = frequency)) +
  geom_point() +
  labs(title = "P2000 Klimawörter Frequencies") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# ggsave(plot = plot1, width = 10, height = 5, dpi = 300, filename = "klima_eike_plot.jpeg")
plot1
plot2 <- ggplot(freq_klima_c2000, aes(x = feature, y = frequency)) +
  geom_point() +
  labs(title = "C2000 Klimawörter Frequencies") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# ggsave(plot = plot2, width = 10, height = 5, dpi = 300, filename = "klima_klimarep_plot.jpeg")
plot2
# To save the lists of klima words:
# without "$feature" the whole table is written (with frequency info etc.)
# capture.output(list(freq_klima_p2000$feature), file = "terms_pro.txt")
# capture.output(list(freq_klima_c2000$feature), file = "terms_contra.txt")
# weighted words ---------------------------------------------------------
# proportional (relative) term frequencies per document
p2000_weight <- dfm_weight(sp_dfm_p2000, scheme = "prop")
c2000_weight <- dfm_weight(sp_dfm_c2000, scheme = "prop")
relfreq_p2000 <- textstat_frequency(p2000_weight, n = 50)
relfreq_c2000 <- textstat_frequency(c2000_weight, n = 50)
# tf-idf weighting on the lemmatized dfms
p2000_tfidf <- dfm_tfidf(sp_dfm_p2000, scheme_tf = "prop")
c2000_tfidf <- dfm_tfidf(sp_dfm_c2000, scheme_tf = "prop")
# order features by descending relative frequency and plot
relfreq_p2000$feature <- with(relfreq_p2000, reorder(feature, -frequency))
plot3 <- ggplot(relfreq_p2000, aes(x = feature, y = frequency)) +
  geom_point() +
  labs(title = "P2000 Frequencies") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
plot3
# Top tf-idf features ----------------------------------------------------
# force = TRUE lets textstat_frequency() rank non-count (tf-idf) weights.
pro_freq_tfidf <- textstat_frequency(p2000_tfidf, n = 10, force = TRUE)
con_freq_tfidf <- textstat_frequency(c2000_tfidf, n = 10, force = TRUE)
# Horizontal dot plots: the rows are mapped onto a reversed discrete axis
# and relabelled with the feature names, so after coord_flip() the
# highest-weighted feature sits at the top.
tplot_tfidf_p2000 <- ggplot(pro_freq_tfidf,
                            aes(x = factor(nrow(pro_freq_tfidf):1),
                                y = frequency)) +
  geom_point() +
  coord_flip() +
  scale_x_discrete(breaks = factor(nrow(pro_freq_tfidf):1),
                   labels = pro_freq_tfidf$feature) +
  labs(x = NULL, y = "tf-idf")
tplot_tfidf_p2000
tplot_tfidf_c2000 <- ggplot(con_freq_tfidf,
                            aes(x = factor(nrow(con_freq_tfidf):1),
                                y = frequency)) +
  geom_point() +
  coord_flip() +
  scale_x_discrete(breaks = factor(nrow(con_freq_tfidf):1),
                   labels = con_freq_tfidf$feature) +
  labs(x = NULL, y = "tf-idf")
tplot_tfidf_c2000
topfeatures(p2000_tfidf, n=20)
mehr klimapolitische anmelden newsletter laufende erfahren arbeit information
21.685082 21.487854 21.303763 21.163076 20.840390 20.185371 17.596305 17.068803
bleiben thema aktuell ikem mensch cookies website veränderung
16.172993 14.977431 14.130058 13.211684 12.981626 11.033872 5.354600 5.216797
uhr stehen disclosure unterstützen
4.143985 3.929530 3.847595 3.835239
topfeatures(c2000_tfidf, n=20)
... admin kommentar klima energie axel göhring jul robert frey chris feb
25.995707 20.833266 13.313344 12.561506 11.308458 11.140143 11.094183 10.416316 10.078114 9.249402 9.229529 8.941108
apr okt jan dez jun sep mrz demmig
8.725734 8.553627 8.532355 7.220623 7.057978 6.865806 6.737388 6.720881
#p2000_weight
#textstat_frequency(p2000_tfidf, n=10)
dfm_klima
Document-feature matrix of: 4,000 documents, 2,575 features (99.86% sparse) and 3 docvars.
features
docs klima klimaschutzgesetze klimawandel klimafreundlich klimazoll klimaschutz klimaschützer:innen
kr_00007.txt 0.02142857 0.007142857 0.007142857 0.007142857 0.007142857 0.007142857 0.007142857
ikem_01141.txt 0 0 0 0 0 0 0
fff_de_00121.txt 0 0 0 0 0 0 0
ikem_00898.txt 0 0 0 0 0 0 0
ikem_00709.txt 0 0 0 0 0 0 0
ikem_00588.txt 0 0 0 0 0 0.003424658 0
features
docs klimaneutralität klimapolitische klimastreik
kr_00007.txt 0.007142857 0 0
ikem_01141.txt 0 0.07142857 0
fff_de_00121.txt 0 0 0.01724138
ikem_00898.txt 0 0 0
ikem_00709.txt 0 0 0
ikem_00588.txt 0 0 0
[ reached max_ndoc ... 3,994 more documents, reached max_nfeat ... 2,565 more features ]
# to remove special chars from corpus
# NOTE(review): gsub() on a quanteda corpus coerces it to a plain
# character vector, so pro2000 loses its corpus class and docvars from
# here on -- confirm this is intended before reusing pro2000 as a corpus.
pro2000 <- gsub("\\|", "", pro2000)
pro2000 <- gsub("=", "", pro2000)
# convert into collocation dataframe
p_coll <- textstat_collocations(pro2000, min_count=50)
# show collocations ordered by raw count, descending
arrange(p_coll, desc(count))
de_stps
[1] "aber\\b|\\balle\\b|\\ballem\\b|\\ballen\\b|\\baller\\b|\\balles\\b|\\bals\\b|\\balso\\b|\\bam\\b|\\ban\\b|\\bander\\b|\\bandere\\b|\\banderem\\b|\\banderen\\b|\\banderer\\b|\\banderes\\b|\\banderm\\b|\\bandern\\b|\\banderr\\b|\\banders\\b|\\bauch\\b|\\bauf\\b|\\baus\\b|\\bbei\\b|\\bbin\\b|\\bbis\\b|\\bbist\\b|\\bda\\b|\\bdamit\\b|\\bdann\\b|\\bder\\b|\\bden\\b|\\bdes\\b|\\bdem\\b|\\bdie\\b|\\bdas\\b|\\bdaß\\b|\\bderselbe\\b|\\bderselben\\b|\\bdenselben\\b|\\bdesselben\\b|\\bdemselben\\b|\\bdieselbe\\b|\\bdieselben\\b|\\bdasselbe\\b|\\bdazu\\b|\\bdein\\b|\\bdeine\\b|\\bdeinem\\b|\\bdeinen\\b|\\bdeiner\\b|\\bdeines\\b|\\bdenn\\b|\\bderer\\b|\\bdessen\\b|\\bdich\\b|\\bdir\\b|\\bdu\\b|\\bdies\\b|\\bdiese\\b|\\bdiesem\\b|\\bdiesen\\b|\\bdieser\\b|\\bdieses\\b|\\bdoch\\b|\\bdort\\b|\\bdurch\\b|\\bein\\b|\\beine\\b|\\beinem\\b|\\beinen\\b|\\beiner\\b|\\beines\\b|\\beinig\\b|\\beinige\\b|\\beinigem\\b|\\beinigen\\b|\\beiniger\\b|\\beiniges\\b|\\beinmal\\b|\\ber\\b|\\bihn\\b|\\bihm\\b|\\bes\\b|\\betwas\\b|\\beuer\\b|\\beure\\b|\\beurem\\b|\\beuren\\b|\\beurer\\b|\\beures\\b|\\bfür\\b|\\bgegen\\b|\\bgewesen\\b|\\bhab\\b|\\bhabe\\b|\\bhaben\\b|\\bhat\\b|\\bhatte\\b|\\bhatten\\b|\\bhier\\b|\\bhin\\b|\\bhinter\\b|\\bich\\b|\\bmich\\b|\\bmir\\b|\\bihr\\b|\\bihre\\b|\\bihrem\\b|\\bihren\\b|\\bihrer\\b|\\bihres\\b|\\beuch\\b|\\bim\\b|\\bin\\b|\\bindem\\b|\\bins\\b|\\bist\\b|\\bjede\\b|\\bjedem\\b|\\bjeden\\b|\\bjeder\\b|\\bjedes\\b|\\bjene\\b|\\bjenem\\b|\\bjenen\\b|\\bjener\\b|\\bjenes\\b|\\bjetzt\\b|\\bkann\\b|\\bkein\\b|\\bkeine\\b|\\bkeinem\\b|\\bkeinen\\b|\\bkeiner\\b|\\bkeines\\b|\\bkönnen\\b|\\bkönnte\\b|\\bmachen\\b|\\bman\\b|\\bmanche\\b|\\bmanchem\\b|\\bmanchen\\b|\\bmancher\\b|\\bmanches\\b|\\bmein\\b|\\bmeine\\b|\\bmeinem\\b|\\bmeinen\\b|\\bmeiner\\b|\\bmeines\\b|\\bmit\\b|\\bmuss\\b|\\bmusste\\b|\\bnach\\b|\\bnicht\\b|\\bnichts\\b|\\bnoch\\b|\\bnun\\b|\\bnur\\b|\\bob\\b|\\boder\\b|\\bohne\\b|\\bsehr\\b|\\bsein\\b|\\bseine\\b|\\bseinem\\b|\\bseinen\\b|\\bseiner\\b|
\\bseines\\b|\\bselbst\\b|\\bsich\\b|\\bsie\\b|\\bihnen\\b|\\bsind\\b|\\bso\\b|\\bsolche\\b|\\bsolchem\\b|\\bsolchen\\b|\\bsolcher\\b|\\bsolches\\b|\\bsoll\\b|\\bsollte\\b|\\bsondern\\b|\\bsonst\\b|\\büber\\b|\\bum\\b|\\bund\\b|\\buns\\b|\\bunse\\b|\\bunsem\\b|\\bunsen\\b|\\bunser\\b|\\bunses\\b|\\bunter\\b|\\bviel\\b|\\bvom\\b|\\bvon\\b|\\bvor\\b|\\bwährend\\b|\\bwar\\b|\\bwaren\\b|\\bwarst\\b|\\bwas\\b|\\bweg\\b|\\bweil\\b|\\bweiter\\b|\\bwelche\\b|\\bwelchem\\b|\\bwelchen\\b|\\bwelcher\\b|\\bwelches\\b|\\bwenn\\b|\\bwerde\\b|\\bwerden\\b|\\bwie\\b|\\bwieder\\b|\\bwill\\b|\\bwir\\b|\\bwird\\b|\\bwirst\\b|\\bwo\\b|\\bwollen\\b|\\bwollte\\b|\\bwürde\\b|\\bwürden\\b|\\bzu\\b|\\bzum\\b|\\bzur\\b|\\bzwar\\b|\\bzwischen\\b|\\bdass\\b|\\b=\\b|\\bthe\\b|\\bseit\\b|\\bab\\b|\\bbeim\\b|\\b\n\\b|\\bmal\\b|\\bc\\b|\\b\\|\\b|\\b|\\b|\\bm\\b|\\bkommentare\\b|\\bneueste\\b|\\bgepostet\\b|\\badmin\\b|\\bcookies\\b|\\binhalte\\b|\\binhalt\\b|\\bnewsletter\\b|\\bposten\\b|\\bzugriff\\b|\\bpasswort\\b|\\bgeschützt\\b|\\bseite\\b|\\bwebsite\\b|\\bwebseite\\b|\\band\\b|\\b0\\b|\\b1\\b|\\b2\\b|\\b3\\b|\\b4\\b|\\b5\\b|\\b6\\b|\\b7\\b|\\b8\\b|\\b9\\b|\\bmfg\\b|\\bw\\b|\\bt\\b|\\bwer"
# NOTE(review): p2000_coll_clean is not created anywhere in this
# transcript -- presumably a stopword-cleaned collocation table built
# off-screen (cf. de_stps regex above); confirm where it comes from.
# Order collocations by descending count for plotting.
p2000_coll_clean$collocation <- with(p2000_coll_clean, reorder(collocation, -count))
# dot plot of collocation counts, most frequent first
plot <- ggplot(p2000_coll_clean, aes(x=collocation, y=count)) +
geom_point()+ggtitle("P2000 Frequencies")+
theme(axis.text.x = element_text(angle=90,hjust=1))
#ggsave(plot=plot1, width = 10, height = 5, dpi=300, filename="klima_eike_plot.jpeg" )
plot
# Collocations with prior stopword removal (flagged "not working" in the
# original notes). tokens_remove() is the exact equivalent of
# tokens_select(..., selection = "remove").
p2000_toks <- tokens(pro2000)
p2000_toks_sw <- tokens_remove(p2000_toks, full_stopwords)
p2000_coll <- textstat_collocations(p2000_toks_sw, min_count = 3)
p2000_coll %>% arrange(desc(count))
summary(full_corpus, n=5)
Corpus consisting of 4000 documents, showing 5 documents:
Text Types Tokens Sentences origin language group
kr_00007.txt 196 284 19 kr german activists
ikem_01141.txt 27 30 2 ikem german activists
fff_de_00121.txt 105 178 11 fff_de german activists
ikem_00898.txt 211 880 32 ikem german activists
ikem_00709.txt 20 43 3 ikem german activists
#create input
# quanteda >= 3 deprecates dfm() directly on a corpus (see the emitted
# warning): tokenize explicitly first. Defaults (lowercasing, keeping
# punctuation) match the old dfm.corpus() behavior.
corp <- full_corpus %>%
  tokens() %>%
  dfm()
Warnung: 'dfm.corpus()' is deprecated. Use 'tokens()' first.
# TODO: try lemmatization here as well
# keep only the features starting with "klima"
de_klima <- dfm_select(corp, pattern ="klima*")
# proportional weighting so differently sized groups stay comparable
de_relfreq <- dfm_weight(de_klima, scheme="prop")
# frequency table grouped by the `group` docvar (activists vs sceptics)
de_freqs <- textstat_frequency(de_relfreq, groups=group)
#plotting
# split the grouped table into one feature/frequency frame per group
freqs.act <- filter(de_freqs, group == "activists") %>% as.data.frame() %>% select(feature, frequency)
freqs.scept <- filter(de_freqs, group == "sceptics") %>% as.data.frame() %>% select(feature, frequency)
# join on feature: frequency.x = activists, frequency.y = sceptics;
# factor(feature, feature) freezes the current row order for the plot axis
freqs <- left_join(freqs.act, freqs.scept, by = "feature") %>% head(30) %>% arrange(frequency.x) %>% mutate(feature = factor(feature, feature))
# dumbbell plot: a grey segment connects the two groups' frequencies;
# red = activists, lightblue = sceptics
ggplot(freqs) +
geom_segment(aes(x=feature, xend=feature, y=frequency.x, yend=frequency.y), color="grey") +
geom_point(aes(x=feature, y=frequency.x), color = "red", size = 3 ) +
geom_point(aes(x=feature, y=frequency.y), color = "lightblue", size = 3 ) +
ggtitle("Word Frequencies") +
xlab("") + ylab("Wortfrequenz") +
coord_flip()
Warnung: Removed 1 rows containing missing values (geom_segment).
Warnung: Removed 1 rows containing missing values (geom_point).
freqs.act
freqs.scept
# to get lemmatized dfm
# build plain dfms from the spaCy-lemmatized tokens (stopwords kept here)
sp_pro2000_dfm <- as.tokens(sp_pro2000) %>%
dfm()
sp_con2000_dfm <- as.tokens(sp_contra2000) %>%
dfm()
# "klima*" features only, proportionally weighted, per sub-corpus
de_klima_pro <- dfm_select(sp_pro2000_dfm, pattern ="klima*")
de_relfreq_pro <- dfm_weight(de_klima_pro, scheme="prop")
de_freqs_pro <- textstat_frequency(de_relfreq_pro)
de_klima_con <- dfm_select(sp_con2000_dfm, pattern ="klima*")
de_relfreq_con <- dfm_weight(de_klima_con, scheme="prop")
de_freqs_con <- textstat_frequency(de_relfreq_con)
#plotting
# reduce both tables to feature/frequency columns
freqs.act <- filter(de_freqs_pro) %>% as.data.frame() %>% select(feature, frequency)
freqs.scept <- filter(de_freqs_con) %>% as.data.frame() %>% select(feature, frequency)
# join on feature: frequency.x = activists, frequency.y = sceptics;
# factor(feature, feature) freezes the current row order for the plot axis
freqs <- left_join(freqs.act, freqs.scept, by = "feature") %>% head(30) %>% arrange(frequency.x) %>% mutate(feature = factor(feature, feature))
# dumbbell plot with a colour legend mapped via the `colour` aesthetic
p <- ggplot(freqs) +
geom_segment(aes(x=feature, xend=feature, y=frequency.x, yend=frequency.y), color="grey") +
geom_point(aes(x=feature, y=frequency.x, colour="Activists"), size = 3) +
geom_point(aes(x=feature, y=frequency.y, colour="Sceptics"), size = 3 ) +
ggtitle("Word Frequencies") +
xlab("") + ylab("Frequency") +
coord_flip()
# rename the legend title
p+labs(colour="Group")
Warnung: Removed 1 rows containing missing values (geom_segment).
Warnung: Removed 1 rows containing missing values (geom_point).
# proportionally weighted dfm of the full corpus, stopwords removed
dfm_weight_corp <- full_corpus %>%
tokens(remove_punct = TRUE) %>%
tokens_remove(de_stopwords1) %>%
dfm() %>%
dfm_weight(scheme = "prop")
# Calculate relative frequency by origin (source site), top 10 per group
freq_weight <- textstat_frequency(dfm_weight_corp, n = 10,
groups = dfm_weight_corp$origin)
# one facet per origin; the reversed numeric x axis plus relabelling keeps
# each facet's features in rank order after coord_flip()
ggplot(data = freq_weight, aes(x = nrow(freq_weight):1, y = frequency)) +
geom_point() +
facet_wrap(~ group, scales = "free") +
coord_flip() +
scale_x_continuous(breaks = nrow(freq_weight):1,
labels = freq_weight$feature) +
labs(x = NULL, y = "Relative frequency")
summary(full_corpus, n=10)
Corpus consisting of 4000 documents, showing 10 documents:
Text Types Tokens Sentences origin language group
kr_00007.txt 196 284 19 kr german activists
ikem_01141.txt 27 30 2 ikem german activists
fff_de_00121.txt 105 178 11 fff_de german activists
ikem_00898.txt 211 880 32 ikem german activists
ikem_00709.txt 20 43 3 ikem german activists
ikem_00588.txt 210 605 11 ikem german activists
kr_00036.txt 415 695 30 kr german activists
ikem_00627.txt 354 1054 23 ikem german activists
ikem_00532.txt 277 530 29 ikem german activists
ikem_00518.txt 140 233 6 ikem german activists
# comparison wordcloud: one half per group (activists vs sceptics),
# top 100 terms after stopword/number removal and min-frequency trimming
full_corpus %>%
tokens(remove_punct = TRUE, remove_numbers=TRUE) %>%
tokens_remove(de_stopwords1) %>%
dfm() %>%
dfm_group(groups = group) %>%
dfm_trim(min_termfreq = 5, verbose = FALSE) %>%
textplot_wordcloud(comparison = TRUE, max_words=100)
#,color=c("lightblue","blue"))
# lexical dispersion (x-ray) plot of "klima*" matches in both sub-corpora
textplot_xray(
kwic(tokens(pro2000), pattern = "klima*"),
kwic(tokens(contra2000), pattern = "klima*"))
library(tidytext)
Fehler in library(tidytext) : es gibt kein Paket namens ‘tidytext’
# Topic-model input. Modernized dfm construction: dfm(x, remove = ...)
# is deprecated in quanteda >= 3, so tokenize and remove stopwords first.
dfm_full <- full_corpus %>%
  tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
  tokens_remove(de_stopwords1) %>%
  dfm()
dfm_full
tm_full <- convert(dfm_full, to = "topicmodels")
# NOTE(review): LDA() comes from the 'topicmodels' package, which is never
# loaded in this transcript -- confirm library(topicmodels) runs first.
topicModel <- LDA(tm_full, k = 5, method = "Gibbs",
                  control = list(iter = 500, verbose = 25))
# ten most probable terms per topic
terms(topicModel, 10)
# Ideas:
# DONE: wordcloud comparison plot by group:
#       https://quanteda.io/articles/pkgdown/examples/plotting.html
# Lexical dispersion plot (x-ray) -> could be done for the keyword pattern "klima*"
# Next: calculate "corpus similarity"
# DONE: save the list of klima words with group and counts
#       -> which climate words occur where, and how often
# Analyse the results; literature review on the text-mining topics
# Topic modeling: https://www.tidytextmining.com/topicmodeling.html